@inproceedings{zaree-etal-2025-attention,
title = "Attention Eclipse: Manipulating Attention to Bypass {LLM} Safety-Alignment",
author = "Zaree, Pedram and
Al Mamun, Md Abdullah and
Alam, Quazi Mishkatul and
Dong, Yue and
Alouani, Ihsen and
Abu-Ghazaleh, Nael",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.842/",
doi = "10.18653/v1/2025.emnlp-main.842",
pages = "16648--16668",
ISBN = "979-8-89176-332-6",
abstract = "Recent research has shown that carefully crafted jailbreak inputs can induce large language models to produce harmful outputs, despite safety measures such as alignment. It is important to anticipate the range of potential Jailbreak attacks to guide effective defenses and accurate assessment of model safety. In this paper, we present a new approach for generating highly effective Jailbreak attacks that manipulate the attention of the model to selectively strengthen or weaken attention among different parts of the prompt. By harnessing attention loss, we develop more effective jailbreak attacks, that are also transferrable. The attacks amplify the success rate of existing Jailbreak algorithms, including GCG, AutoDAN, and ReNeLLM, while lowering their generation cost (for example, the amplified GCG attack achieves 91.2{\%} ASR, vs. 67.9{\%} for the original attack on Llama2-7B-chat/AdvBench, using less than a third of the generation time)."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zaree-etal-2025-attention">
<titleInfo>
<title>Attention Eclipse: Manipulating Attention to Bypass LLM Safety-Alignment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pedram</namePart>
<namePart type="family">Zaree</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="given">Abdullah</namePart>
<namePart type="family">Al Mamun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Quazi</namePart>
<namePart type="given">Mishkatul</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ihsen</namePart>
<namePart type="family">Alouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nael</namePart>
<namePart type="family">Abu-Ghazaleh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Recent research has shown that carefully crafted jailbreak inputs can induce large language models to produce harmful outputs, despite safety measures such as alignment. It is important to anticipate the range of potential Jailbreak attacks to guide effective defenses and accurate assessment of model safety. In this paper, we present a new approach for generating highly effective Jailbreak attacks that manipulate the attention of the model to selectively strengthen or weaken attention among different parts of the prompt. By harnessing attention loss, we develop more effective jailbreak attacks, that are also transferrable. The attacks amplify the success rate of existing Jailbreak algorithms, including GCG, AutoDAN, and ReNeLLM, while lowering their generation cost (for example, the amplified GCG attack achieves 91.2% ASR, vs. 67.9% for the original attack on Llama2-7B-chat/AdvBench, using less than a third of the generation time).</abstract>
<identifier type="citekey">zaree-etal-2025-attention</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.842</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.842/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>16648</start>
<end>16668</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Attention Eclipse: Manipulating Attention to Bypass LLM Safety-Alignment
%A Zaree, Pedram
%A Al Mamun, Md Abdullah
%A Alam, Quazi Mishkatul
%A Dong, Yue
%A Alouani, Ihsen
%A Abu-Ghazaleh, Nael
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F zaree-etal-2025-attention
%X Recent research has shown that carefully crafted jailbreak inputs can induce large language models to produce harmful outputs, despite safety measures such as alignment. It is important to anticipate the range of potential Jailbreak attacks to guide effective defenses and accurate assessment of model safety. In this paper, we present a new approach for generating highly effective Jailbreak attacks that manipulate the attention of the model to selectively strengthen or weaken attention among different parts of the prompt. By harnessing attention loss, we develop more effective jailbreak attacks, that are also transferrable. The attacks amplify the success rate of existing Jailbreak algorithms, including GCG, AutoDAN, and ReNeLLM, while lowering their generation cost (for example, the amplified GCG attack achieves 91.2% ASR, vs. 67.9% for the original attack on Llama2-7B-chat/AdvBench, using less than a third of the generation time).
%R 10.18653/v1/2025.emnlp-main.842
%U https://aclanthology.org/2025.emnlp-main.842/
%U https://doi.org/10.18653/v1/2025.emnlp-main.842
%P 16648-16668
Markdown (Informal)
[Attention Eclipse: Manipulating Attention to Bypass LLM Safety-Alignment](https://aclanthology.org/2025.emnlp-main.842/) (Zaree et al., EMNLP 2025)
ACL